Data set has 985 rows and 12 columns.
The goal is to identify the variables impacting house prices in Sacramento.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sympy as sy
import pandas as pd
import pandas_profiling
import seaborn as sns
import mpl_toolkits
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
border-style: solid;
border: 3px solid lightgray;
}
</style>
# Load the Sacramento housing data set; the first CSV column is the row index.
data = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/linear-regression-video/master/data/sacremento_housing.csv', index_col=0)
# First look at the data: head, dimensions, dtypes/non-null counts,
# a random sample, and summary statistics for every column (including non-numeric).
data.head()
data.shape
data.info()
data.sample(10)
data.describe(include = 'all')
# Listing counts per city, as a bar chart.
data.city.value_counts().plot(kind='bar',figsize=(20,8))
# Automated EDA report.
# NOTE(review): pandas_profiling was renamed to ydata-profiling; this import
# style only works on older releases — confirm the installed version.
pandas_profiling.ProfileReport(data)
Dropping duplicate rows
# Remove exact duplicate rows; the defaults already keep the first occurrence
# and return a new frame, so no arguments are needed.
data = data.drop_duplicates()
Dropping a few columns which will not impact our analysis
# Discard identifier/location columns that will not be used in the analysis.
data = data.drop(columns=['latitude', 'longitude', 'state', 'street', 'sale_date', 'zip', 'type'])
# Normalise the awkward 'sq__ft' column name.
data = data.rename(columns={'sq__ft': 'sqft'})
print(data.columns)
Quick check on missing values
def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (count of nulls) and 'Percent' (nulls as a percentage of rows),
    both sorted in descending order of missingness.
    """
    null_counts = data.isnull().sum().sort_values(ascending = False)
    null_pct = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([null_counts, null_pct], axis=1, keys=['Total', 'Percent'])
# Display the missing-value summary and a few sample rows.
missing_data(data)
data.sample(5)
Removing bath/beds with zero values
# Restrict to Sacramento listings and discard records with zero beds or
# baths, which are almost certainly data-entry errors.
data1 = data[data['city'] == 'SACRAMENTO']
valid_rooms = (data1['baths'] != 0) & (data1['beds'] != 0)
data1 = data1[valid_rooms]
data1.baths.unique()
data1.beds.unique()
# Zero square footage is treated as missing and imputed with the median.
data1['sqft'] = data1['sqft'].replace(0, data1['sqft'].median())
# Keep non-negative prices only.
data1 = data1[data1['price'] >= 0]
data1.info()
data1.sample(10)
# Keep only the modelling columns: the target (price) and the numeric features.
data_s = data1[['price','sqft','beds','baths']]
pandas_profiling.ProfileReport(data_s)
data_s.info()
data_s.sample()
# Bar chart of listings per bedroom count.
data_s['beds'].value_counts().plot(kind='bar')
plt.title('number of Bedroom')
plt.xlabel('Bedrooms')
plt.ylabel('Count')
# BUG FIX: the original had `sns.despine` with no parentheses — a bare
# attribute access that does nothing. It must be called to strip the spines.
sns.despine()
# Scatter of price against square footage.
plt.scatter(data_s.price,data_s.sqft)
plt.title("Price vs Square Feet")
# Use explicit keyword arguments: positional x/y for catplot were deprecated
# in seaborn 0.12 and removed later.
sns.catplot(x="beds", y="price", data=data_s, kind="bar", palette="PuBuGn_d", height=6, aspect=2)
plt.xlabel('Bed Room')
plt.ylabel('Price')
plt.show()
Something is fishy: 6- and 8-bedroom houses cannot be so cheap. Let's filter them out, as their counts are negligible.
# Drop the rare (and suspiciously cheap) 6- and 8-bedroom listings.
data_s = data_s[~data_s['beds'].isin([6, 8])]
data_s.beds.unique()
# Working copy used for the regression step.
data_r = data_s.copy(deep=True)
data_r.sample(5)
data_r.describe()
# 2x2 grid of distributions for the target and each feature.
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True) # Set up the matplotlib figure
sns.despine(left=True)
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(stat="density", kde=True) is the supported equivalent
# (density-normalised histogram with a KDE overlay).
sns.histplot(data_r.price, color="b", stat="density", kde=True, ax=axes[0, 0])
sns.histplot(data_r.sqft, color="r", stat="density", kde=True, ax=axes[0, 1])
sns.histplot(data_r.beds, color="g", stat="density", kde=True, ax=axes[1, 0])
sns.histplot(data_r.baths, color="m", stat="density", kde=True, ax=axes[1, 1])
# Joint scatter + regression plots of each feature against price.
# NOTE(review): positional x/y arguments to jointplot are deprecated in
# seaborn >= 0.12 — prefer jointplot(x="sqft", y="price", ...).
JG1 = sns.jointplot("sqft", "price", data=data_r, kind='reg')
JG2 = sns.jointplot("beds", "price", data=data_r, kind='reg')
JG3 = sns.jointplot("baths", "price", data=data_r, kind='reg')
#subplots migration
# NOTE(review): the loop below relies on matplotlib's private Figure
# internals (_axstack, _make_key), which were removed in matplotlib 3.6.
# This cell will raise AttributeError on modern matplotlib — confirm the
# pinned matplotlib version before relying on it.
f = plt.figure()
for J in [JG1, JG2,JG3]:
    for A in J.fig.axes:
        f._axstack.add(f._make_key(A), A)
# Pairwise relationships between all modelling columns.
# `size` was renamed to `height` in seaborn 0.9 and later removed, so use
# the supported keyword.
sns.pairplot(data_r, height=2, aspect=1.5)
# Regression of price against each feature.
# NOTE(review): this plot uses the full `data` frame, not the filtered
# `data_r` used just above — confirm that is intentional.
sns.pairplot(data, x_vars=['sqft', 'beds', 'baths'], y_vars='price', height=5, aspect=1, kind='reg')
Observation
# Correlation matrix of target and features, plus an annotated heatmap.
data_s.corr()
sns.heatmap( data_s.corr(), annot=True );
No Observations from correlation
Linear regression is a basic and commonly used type of predictive analysis. The overall idea of regression is to examine two things:
These regression estimates are used to explain the relationship between one dependent variable and one or more independent variables. The simplest form of the regression equation with one dependent and one independent variable is defined by the formula :
$y = \beta_0 + \beta_1x$

What does each term represent?
Three major uses for regression analysis are:
While taking errors into consideration, the equation of linear regression is: Generally speaking, coefficients are estimated using the least squares criterion, which means we are finding the line (mathematically) which minimizes the sum of squared residuals (or "sum of squared errors"):
How do the model coefficients relate to the least squares line?
Here is a graphical depiction of those calculations:
Preparing X and y using pandas
from sklearn.preprocessing import StandardScaler
# Standardise every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split below — that leaks test-set statistics into training;
# it should be fitted on the training split only. Also note the target
# column (price) is scaled along with the features.
scaler = StandardScaler().fit(data_s)
dataR = scaler.transform(data_s)
# transform() returns a bare ndarray, so rebuild the DataFrame and restore
# column names; the order must match data_s ('price','sqft','beds','baths').
dataR = pd.DataFrame(dataR)
dataR.head()
dataR.columns = ['price','sqft','bed','bath']
dataR.head()
dataR.info()
feature_cols = ['sqft', 'bed', 'bath'] # create a Python list of feature names
X = dataR[feature_cols]
# Splitting the dataset into training and test sets (80/20, reproducible seed).
data_train, data_test = train_test_split(dataR, test_size = 0.2, random_state = 0)
# Dropping price from x_train and x_test matrices, and creating y_train and y_test vectors for price values.
# Use the explicit `columns=` keyword: the positional `axis` argument to
# DataFrame.drop was deprecated and removed in pandas 2.0.
x_train = data_train.drop(columns=['price'])
y_train = data_train['price']
x_test = data_test.drop(columns=['price'])
y_test = data_test['price']
from sklearn.model_selection import train_test_split
def split(X, y):
    """Split features X and target y into 80/20 train/test sets.

    Uses random_state=1 so the split is reproducible across runs.
    Returns (X_train, X_test, y_train, y_test).
    """
    return train_test_split(X, y, test_size=0.20, random_state=1)
# Checking the shapes of training and test sets.
print('Shape of x_train: ', x_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of x_test: ', x_test.shape)
print('Shape of y_test: ', y_test.shape)
# Fit an ordinary least-squares model on the training split.
linreg = LinearRegression()
linreg.fit(x_train, y_train)
# In-sample predictions (first 10 shown).
y_pred_train = linreg.predict(x_train)
y_pred_train[:10]
# Out-of-sample predictions (first 10 shown).
y_pred_test = linreg.predict(x_test)
y_pred_test[:10]
To apply any machine learning algorithm on your dataset, basically there are 4 steps:
from sklearn.linear_model import LinearRegression
# BUG FIX: the original cell order fitted the model with X_train/y_train
# BEFORE split() created them, and computed RMSE before predicting — a
# notebook executed out of order. Reordered: define y, split, report
# shapes, fit, predict, evaluate.
X = dataR[feature_cols]
y = dataR.price
X_train, X_test, y_train, y_test = split(X, y)
print('Train cases as below')
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('\nTest cases as below')
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred_test = linreg.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
def linear_reg( X, y, gridsearch = False):
    """Fit a linear regression of y on X and report the test-set RMSE.

    Parameters
    ----------
    X : DataFrame of standardised features.
    y : Series, the (standardised) target.
    gridsearch : if True, tune the estimator with 10-fold GridSearchCV.

    Returns the fitted estimator (LinearRegression or GridSearchCV).
    """
    X_train, X_test, y_train, y_test = split(X,y)
    from sklearn.linear_model import LinearRegression
    linreg = LinearRegression()
    if not(gridsearch):
        linreg.fit(X_train, y_train)
    else:
        from sklearn.model_selection import GridSearchCV
        # NOTE: the original searched over `normalize`, which was removed
        # from LinearRegression in scikit-learn 1.2 and would raise at fit
        # time; search over fit_intercept instead.
        parameters = {'fit_intercept':[True,False], 'copy_X':[True, False]}
        linreg = GridSearchCV(linreg,parameters, cv = 10,refit = True)
        linreg.fit(X_train, y_train) # fit the model to the training data (learn the coefficients)
        print("Mean cross-validated score of the best_estimator : ", linreg.best_score_)
    y_pred_test = linreg.predict(X_test) # make predictions on the testing set
    # BUG FIX: the original printed the raw MSE while labelling it RMSE —
    # take the square root so the printed value matches its label.
    RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
    print('RMSE for the test set is {}'.format(RMSE_test))
    return linreg
Note: the Linear Regression Model with GridSearchCV is implemented at Table of Contents: 8
# Fit on all three features and inspect the learned parameters.
X = dataR[feature_cols]
y = dataR.price
linreg = linear_reg(X,y)
print('Intercept:',linreg.intercept_) # print the intercept
print('Coefficients:',linreg.coef_)
# NOTE(review): this mutates feature_cols in place — any later cell that
# reuses feature_cols will see the extra 'Intercept' entry.
feature_cols.insert(0,'Intercept')
coef = linreg.coef_.tolist()
coef.insert(0, linreg.intercept_)
# Pair each term name with its estimated coefficient and print the equation.
eq1 = zip(feature_cols, coef)
for c1,c2 in eq1:
    print(c1,c2)
Y = 0.01392 + (0.9193686 * SquareFoot) + (-0.22929 * Bedrooms) + (-0.073774 * Bathrooms)
We interpret the SquareFoot coefficient (0.9193686)
The price of the house is dependent on the size of the house.
# Predictions on the training split (in-sample).
y_pred_train = linreg.predict(X_train)
y_pred_train
# Predictions on the held-out test split (out-of-sample).
y_pred_test = linreg.predict(X_test) # make predictions on the testing set
y_pred_test
Error is the deviation of the values predicted by the model with the true values.
For example, if a model predicts that the price of apple is Rs75/kg, but the actual price of apple is Rs100/kg, then the error in prediction will be Rs25/kg.
Below are the types of error we will be calculating for our linear regression model:
Mean Absolute Error (MAE) is the mean of the absolute value of the errors: $$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$ Computing the MAE for our Price predictions
# Mean Absolute Error on both splits; comparable train/test values suggest
# the model is not overfitting.
MAE_train = metrics.mean_absolute_error(y_train, y_pred_train)
MAE_test = metrics.mean_absolute_error(y_test, y_pred_test)
print('MAE for training set is {}'.format(MAE_train))
print('MAE for test set is {}'.format(MAE_test))
Mean Squared Error (MSE) is the mean of the squared errors: $$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
Computing the MSE for our Price predictions
# Mean Squared Error on both splits (penalises large errors more than MAE).
MSE_train = metrics.mean_squared_error(y_train, y_pred_train)
MSE_test = metrics.mean_squared_error(y_test, y_pred_test)
print('MSE for training set is {}'.format(MSE_train))
print('MSE for test set is {}'.format(MSE_test))
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$Computing the RMSE for our Price predictions
# Root Mean Squared Error: sqrt of MSE, back in the units of the target.
RMSE_train = np.sqrt( metrics.mean_squared_error(y_train, y_pred_train))
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
print('RMSE for training set is {}'.format(RMSE_train))
print('RMSE for test set is {}'.format(RMSE_test))
Comparing these metrics:
R-squared is the proportion of variance explained, meaning the proportion of variance in the observed data that is explained by the model, or the reduction in error over the null model. (The null model just predicts the mean of the observed response, and thus it has an intercept and no slope.)
R-squared is between 0 and 1, and higher is better because it means that more variance is explained by the model. But there is one shortcoming of Rsquare method and that is R-squared will always increase as you add more features to the model, even if they are unrelated to the response. Thus, selecting the model with the highest R-squared is not a reliable approach for choosing the best linear model.
There is alternative to R-squared called adjusted R-squared that penalizes model complexity (to control for overfitting).
# Training-set R^2 and adjusted R^2, computed from first principles.
preds_train = linreg.predict(X_train)
resid_ss = sum((y_train - preds_train) ** 2)     # sum of squared residuals
total_ss = sum((y_train - np.mean(y_train)) ** 2)  # total sum of squares
r_squared = 1 - float(resid_ss) / total_ss
# Adjusted R^2 penalises the number of predictors used.
n_obs, n_feats = len(y_train), X_train.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_feats - 1)
print(r_squared, adjusted_r_squared)
# Test-set R^2 and adjusted R^2 — how well the fit generalises.
preds_test = linreg.predict(X_test)
resid_ss = sum((y_test - preds_test) ** 2)       # sum of squared residuals
total_ss = sum((y_test - np.mean(y_test)) ** 2)  # total sum of squares
r_squared = 1 - float(resid_ss) / total_ss
# Adjusted R^2 penalises the number of predictors used.
n_obs, n_feats = len(y_test), X_test.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_feats - 1)
print(r_squared, adjusted_r_squared)
At times some features do not contribute much to the accuracy of the model, in that case its better to discard those features.
# Feature-ablation experiments: refit on reduced feature sets to check
# whether dropping a weak feature changes the test RMSE.
feature_cols = ['sqft','bath'] # create a Python list of feature names
X = dataR[feature_cols]
y = dataR.price
linreg=linear_reg(X,y, gridsearch=True)
feature_cols = ['sqft','bed'] # create a Python list of feature names
X = dataR[feature_cols]
y = dataR.price
linreg=linear_reg(X,y, gridsearch=True)